# -*- coding: utf-8 -*-
"""
For preprocessing the India data

"""

import sys
import argparse
import string
import codecs
from os import path
import random
from collections import defaultdict
import numpy as np
import operator
import pandas as pd
import glob

# Seed the standard-library RNG with a fixed value so that any use of
# `random` is repeatable from run to run.
random.seed(1776)


def new_county_info():
    """
    Create a fresh, zero-initialised county record.

    The float-valued fields are "totalT", "totalNotT", "N", "W1" and "W2";
    "totalX" and "totalNotX" start as integer zeros.

    :return: a new dict on every call, so callers can mutate it freely
    """
    info = dict.fromkeys(("totalT", "totalNotT", "N", "W1", "W2"), 0.0)
    info.update(totalX=0, totalNotX=0)
    return info

def to_float_or_default_field_value(field_value):
    """
    Convert a raw field value to a float.

    :param field_value: value accepted by ``float()``, or the placeholder
        string "Suppressed"
    :return: 0.0 for "Suppressed", otherwise ``float(field_value)``
    """
    return 0.0 if field_value == "Suppressed" else float(field_value)

def dd(n, d):
    """
    Division with a default: a falsy denominator yields 0.0 instead of raising.

    The operands are divided as given, with no type conversion. Under Python 2
    ``/`` on two ints is floor division; use ``ddf`` to force float division.

    :param n: numerator
    :param d: denominator
    :return: ``n / d``, or 0.0 when ``d`` is falsy (0, 0.0, None, ...)
    """
    if not d:
        return 0.0
    return n / d


def ddf(n, d):
    """
    Float division with a default: a falsy denominator yields 0.0.

    Truthiness is tested on ``d`` as passed in, before any conversion; both
    operands are converted with ``float()`` only when the division happens.

    :param n: numerator, convertible to float
    :param d: denominator, convertible to float
    :return: ``float(n) / float(d)``, or 0.0 when ``d`` is falsy
    """
    if not d:
        return 0.0
    numerator = float(n)
    denominator = float(d)
    return numerator / denominator


def get_csv_lines_bygender_attendance_urbanandrural(input_dir, geo_label):
    """
    Build CSV lines describing educational-institution attendance by gender.

    Every ``*.xls`` file found directly in ``input_dir`` is read with pandas.
    A row contributes one output line when its "Area Name" starts with
    "District", its "Age-Group" is "All ages", its "Total/ Rural/ Urban"
    equals ``geo_label`` and its total population is non-zero.

    Output columns (all written with ``%f``):
        X     males / total population
        notX  females / total population
        T     population attending educational institutions / total population
        notT  population not attending / total population
        W1    attending males / males
        W2    attending females / females
        N     total population

    Progress and diagnostics are printed to stdout.

    :param input_dir: directory searched (non-recursively) for ``*.xls`` files
    :param geo_label: one of u'Total', u'Rural', u'Urban'
    :return: list of newline-terminated CSV strings, header line first
    :raises AssertionError: on an unknown ``geo_label`` or when the gender or
        attendance shares of a row do not sum to one
    """
    assert geo_label in [u'Total', u'Rural', u'Urban']

    def _ratio(numerator, denominator):
        # Float division that yields 0.0 for a falsy denominator.
        return float(numerator) / float(denominator) if denominator else 0.0

    diff = []
    lines = [",".join(["X", "notX", "T", "notT", "W1", "W2", "N"]) + "\n"]

    # glob returns files in arbitrary order; sort so the output is reproducible.
    input_files = sorted(glob.glob(path.join(input_dir, "*.xls")))
    for input_line_id, input_file in enumerate(input_files):
        df = pd.read_excel(input_file)

        print("%s %s" % (input_line_id, input_file))

        for _, row in df.iterrows():
            area_name = row[u"Area Name"]
            # Blank spreadsheet cells are read as NaN, which has no .startswith().
            if pd.isnull(area_name) or not area_name.startswith("District"):
                continue
            if row[u'Age-Group'] != u"All ages" or row[u'Total/ Rural/ Urban'] != geo_label:
                continue
            n = int(row[u"Total Population - Persons"])
            if n == 0:
                continue

            males = row[u'Total Population - Males']
            females = row[u'Total Population - Females']
            attending = row[u'Population Attending Educational Institutions - Persons']

            x = _ratio(males, n)
            notx = _ratio(females, n)
            assert np.isclose(notx, 1 - x)
            t = _ratio(attending, n)
            nott = _ratio(n - int(attending), n)
            assert np.isclose(nott, 1 - t)
            w1 = _ratio(row[u'Population Attending Educational Institutions - Males'], males)
            w2 = _ratio(row[u'Population Attending Educational Institutions - Females'], females)

            if x == 0.0 or t == 0.0:
                print("Zero warning %s" % (input_file,))

            lines.append(",".join("%f" % val for val in [x, notx, t, nott, w1, w2, n]) + "\n")
            diff.append(w1 - w2)

        # Running average over all files processed so far; avoid numpy's
        # empty-slice RuntimeWarning when nothing has matched yet.
        mean_diff = np.mean(diff) if diff else float("nan")
        print("Average difference w1 - w2: %s" % (mean_diff,))
    return lines

def get_csv_lines_bygender_literacy_urbanandrural(input_dir, geo_label):
    """
    Build CSV lines describing literacy by gender among people attending
    educational institutions.

    Every ``*.xls`` file found directly in ``input_dir`` is read with pandas.
    A row is considered when its "Area Name" starts with "District", its
    "Age-Group" is "All ages", its "Total/ Rural/ Urban" equals ``geo_label``
    and its total population is non-zero. Each such row must have
    literates + illiterates equal to the population attending educational
    institutions; rows where that sum is zero are skipped because every
    share would be undefined.

    Output columns (all written with ``%f``):
        X     males / N
        notX  females / N
        T     literates / N
        notT  illiterates / N
        W1    literate males / males
        W2    literate females / females
        N     literates + illiterates (persons)

    Progress and diagnostics are printed to stdout.

    :param input_dir: directory searched (non-recursively) for ``*.xls`` files
    :param geo_label: one of u'Total', u'Rural', u'Urban'
    :return: list of newline-terminated CSV strings, header line first
    :raises AssertionError: on an unknown ``geo_label``, when literates plus
        illiterates differ from the attending population, or when the gender
        or literacy shares of a row do not sum to one
    """
    assert geo_label in [u'Total', u'Rural', u'Urban']

    def _ratio(numerator, denominator):
        # Float division that yields 0.0 for a falsy denominator.
        return float(numerator) / float(denominator) if denominator else 0.0

    diff = []
    lines = [",".join(["X", "notX", "T", "notT", "W1", "W2", "N"]) + "\n"]

    # glob returns files in arbitrary order; sort so the output is reproducible.
    input_files = sorted(glob.glob(path.join(input_dir, "*.xls")))
    for input_line_id, input_file in enumerate(input_files):
        df = pd.read_excel(input_file)

        print("%s %s" % (input_line_id, input_file))

        for _, row in df.iterrows():
            area_name = row[u"Area Name"]
            # Blank spreadsheet cells are read as NaN, which has no .startswith().
            if pd.isnull(area_name) or not area_name.startswith("District"):
                continue
            if row[u'Age-Group'] != u"All ages" or row[u'Total/ Rural/ Urban'] != geo_label:
                continue
            if int(row[u"Total Population - Persons"]) == 0:
                continue

            literates = int(row[u'Literates - Persons'])
            illiterates = int(row[u'Illiterates - Persons'])
            assert int(row[u"Population Attending Educational Institutions - Persons"]) == \
                illiterates + literates

            n = literates + illiterates
            if n == 0:
                # The denominator here is the attending population, not the
                # total population checked above; with n == 0 all shares would
                # be 0.0 and the sum-to-one assertions below would fail.
                continue

            males = int(row[u'Literates - Males']) + int(row[u'Illiterates - Males'])
            females = int(row[u'Literates - Females']) + int(row[u'Illiterates - Females'])

            x = _ratio(males, n)
            notx = _ratio(females, n)
            assert np.isclose(notx, 1 - x)
            t = _ratio(row[u'Literates - Persons'], n)
            nott = _ratio(row[u'Illiterates - Persons'], n)
            assert np.isclose(nott, 1 - t)
            w1 = _ratio(row[u'Literates - Males'], males)
            w2 = _ratio(row[u'Literates - Females'], females)

            if x == 0.0 or t == 0.0:
                print("Zero warning %s" % (input_file,))

            lines.append(",".join("%f" % val for val in [x, notx, t, nott, w1, w2, n]) + "\n")
            diff.append(w1 - w2)

        # Running average over all files processed so far; avoid numpy's
        # empty-slice RuntimeWarning when nothing has matched yet.
        mean_diff = np.mean(diff) if diff else float("nan")
        print("Average difference w1 - w2: %s" % (mean_diff,))
    return lines




def save_lines(filename_with_path, list_of_lists):
    """
    Write the given strings to a UTF-8 encoded file, replacing any existing file.

    No separators are added, so each string must carry its own line ending.

    :param filename_with_path: path of the file to create or overwrite
    :param list_of_lists: iterable of strings (despite the name, the items
        are written as-is, like ``writelines`` does)
    """
    with codecs.open(filename_with_path, "w", encoding="utf-8") as out_file:
        out_file.write("".join(list_of_lists))


                                 
def main(arguments):
    """
    Command-line entry point: write one CSV per statistic and geography.

    The attendance statistics are produced first and then the literacy
    statistics, each for u'Total', u'Rural' and u'Urban'. Every result is saved
    in ``--output_dir`` as ``india_bygender_edu_<statistic>_<geo_label>.csv``.

    :param arguments: command-line arguments without the program name
    :return: None
    """
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # Both options are mandatory: without them the path handling below would
    # fail with an opaque TypeError instead of a usage message.
    parser.add_argument('--input_dir', required=True,
                        help="directory containing the input .xls files")
    parser.add_argument('--output_dir', required=True,
                        help="existing directory to write the output .csv files to")

    args = parser.parse_args(arguments)

    input_dir = args.input_dir
    output_dir = args.output_dir

    geo_labels = [u'Total', u'Rural', u'Urban']
    # (output file name template, function producing the CSV lines)
    jobs = [
        ("india_bygender_edu_attendance_%s.csv", get_csv_lines_bygender_attendance_urbanandrural),
        ("india_bygender_edu_literacy_%s.csv", get_csv_lines_bygender_literacy_urbanandrural),
    ]

    for filename_template, get_csv_lines in jobs:
        for geo_label in geo_labels:
            label = filename_template % geo_label
            print(label)
            lines = get_csv_lines(input_dir, geo_label)
            save_lines(path.join(output_dir, label), lines)




# When run as a script, pass the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))

